Load packages:

library(tidyverse)
library(labelled)

Resources:

1 Generate random distributions

num_obs <- 10000

# Generate standard normal distribution (default is mean = 0 and sd = 1)
set.seed(124)
stdnorm_dist <- rnorm(num_obs, mean = 0, sd = 1)  # equivalent to rnorm(10)

# Generate normal distribution w/ custom mean and sd
set.seed(124)
norm_dist <- rnorm(num_obs, mean = 50, sd = 5)

# Generate left-skewed distribution
set.seed(124)
lskew_dist <- rbeta(num_obs, 5, 2)

# Generate right-skewed distribution
set.seed(124)
rskew_dist <- rbeta(num_obs, 2, 5)

# Create dataframe
df_generated_pop <- data.frame(stdnorm_dist, norm_dist, lskew_dist, rskew_dist)
load(file = url('https://github.com/anyone-can-cook/educ152/raw/main/data/ipeds/output_data/panel_data.RData'))

# Create ipeds data frame with fewer variables/observations
df_ipeds_pop <- panel_data %>%
  # keep IPEDS tuition data from fall of which year (e.g., fall 2016 is price for programs in 2016-17 academic year)
  filter(year == 2016) %>%
  # which universities to keep:
    # 2015 carnegie classification: keep research universities (15,16,17) and master's universities (18,19,20)
  filter(c15basic %in% c(15,16,17,18,19,20)) %>%
  # which variables to keep
  select(instnm,unitid,opeid6,opeid,control,c15basic,stabbr,city,zip,locale,obereg, # basic institutional characteristics
         tuition6,fee6,tuition7,fee7, # avg tuition and fees for full-time grad, in-state and out-of-state
         isprof3,ispfee3,osprof3,ospfee3, # avg tuition and fees for MD, in-state and out-of-state
         isprof9,ispfee9,osprof9,ospfee9, # avg tuition and fees for Law, in-state and out-of-state
         chg4ay3,chg7ay3,chg8ay3) %>% # [undergraduate] books+supplies; off-campus (not with family) room and board; off-campus (not with family) other expenses
  # rename variables; syntax <new_name> = <old_name>
  rename(region = obereg, # revion
         tuit_grad_res = tuition6, fee_grad_res = fee6, tuit_grad_nres = tuition7, fee_grad_nres = fee7, # grad
         tuit_md_res = isprof3, fee_md_res = ispfee3, tuit_md_nres = osprof3, fee_md_nres = ospfee3, # md
         tuit_law_res = isprof9, fee_law_res = ispfee9, tuit_law_nres = osprof9, fee_law_nres = ospfee9, # law
         books_supplies = chg4ay3, roomboard_off = chg7ay3, oth_expense_off = chg8ay3) %>% # [undergraduate] expenses
  # create measures of tuition+fees
  mutate(
    tuitfee_grad_res = tuit_grad_res + fee_grad_res, # graduate, state resident
    tuitfee_grad_nres = tuit_grad_nres + fee_grad_nres, # graduate, non-resident
    tuitfee_md_res = tuit_md_res + fee_md_res, # MD, state resident
    tuitfee_md_nres = tuit_md_nres + fee_md_nres, # MD, non-resident
    tuitfee_law_res = tuit_law_res + fee_law_res, # Law, state resident
    tuitfee_law_nres = tuit_law_nres + fee_law_nres) %>% # Law, non-resident  
  # create measures of cost-of-attendance (COA) as the sum of tuition, fees, book, living expenses
  mutate(
    coa_grad_res = tuit_grad_res + fee_grad_res + books_supplies + roomboard_off + oth_expense_off, # graduate, state resident
    coa_grad_nres = tuit_grad_nres + fee_grad_nres + books_supplies + roomboard_off + oth_expense_off, # graduate, non-resident
    coa_md_res = tuit_md_res + fee_md_res + books_supplies + roomboard_off + oth_expense_off, # MD, state resident
    coa_md_nres = tuit_md_nres + fee_md_nres + books_supplies + roomboard_off + oth_expense_off, # MD, non-resident
    coa_law_res = tuit_law_res + fee_law_res + books_supplies + roomboard_off + oth_expense_off, # Law, state resident
    coa_law_nres = tuit_law_nres + fee_law_nres + books_supplies + roomboard_off + oth_expense_off) # %>% # Law, non-resident    
  # [COMMENTED THIS OUT] keep only observations that have non-missing values for the variable coa_grad_res
    # this does cause us to lose some interesting universities, but doing this will eliminate some needless complications with respect to learning core concepts about statistical inference
  #filter(!is.na(coa_grad_res))

# Add variable labels to the tuit+fees variables and coa variables
  # tuition + fees variables
    var_label(df_ipeds_pop[['tuitfee_grad_res']]) <- 'graduate, full-time, resident; avg tuition + required fees'
    var_label(df_ipeds_pop[['tuitfee_grad_nres']]) <- 'graduate, full-time, non-resident; avg tuition + required fees'
    var_label(df_ipeds_pop[['tuitfee_md_res']]) <- 'MD, full-time, state resident; avg tuition + required fees'
    var_label(df_ipeds_pop[['tuitfee_md_nres']]) <- 'MD, full-time, non-resident; avg tuition + required fees'
    var_label(df_ipeds_pop[['tuitfee_law_res']]) <- 'Law, full-time, state resident; avg tuition + required fees'
    var_label(df_ipeds_pop[['tuitfee_law_nres']]) <- 'Law, full-time, non-resident; avg tuition + required fees'
    
  # COA variables
    var_label(df_ipeds_pop[['coa_grad_res']]) <- 'graduate, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
    var_label(df_ipeds_pop[['coa_grad_nres']]) <- 'graduate, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
    var_label(df_ipeds_pop[['coa_md_res']]) <- 'MD, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
    var_label(df_ipeds_pop[['coa_md_nres']]) <- 'MD, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
    var_label(df_ipeds_pop[['coa_law_res']]) <- 'Law, full-time, state resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'
    var_label(df_ipeds_pop[['coa_law_nres']]) <- 'Law, full-time, non-resident COA; == tuition + fees + (ug) books/supplies + (ug) off-campus room and board + (ug) off-campus other expenses'

  #df_ipeds_pop %>% glimpse()

load(file = url('https://github.com/anyone-can-cook/educ152/raw/main/data/college_scorecard/output_data/df_debt_earn_panel_labelled.RData'))

df_scorecard <- df_debt_earn_panel_labelled %>%
    # keep most recent year of data
    filter(field_ay == '2017-18') %>%
    # keep master's degrees
    filter(credlev == 5) %>%
    # carnegie categories to keep: 15 = Doctoral Universities: Very High Research Activity; 16 = Doctoral Universities: High Research Activity
      # note: variable ccbasic from scorecard data is 2015 carnegie classification
    filter(ccbasic %in% c(15,16,17,18,19,20)) %>%
    # drop "parent plus" loan variables and other vars we won't use in this lecture
    select(-contains('_pp'),-contains('_any'),-field_ay,-st_fips,-zip,-longitude,-latitude,-locale2,-highdeg,-accredagency,-relaffil,-hbcu,-annhi,-tribal,-aanapii,-hsi,-nanti,-main,-numbranch,-control) %>%
    # create variable for broad field of degree (e.g., education, business)
    mutate(cipdig2 = str_sub(string = cipcode, start = 1, end = 2)) %>%
    # shorten variable cipdesc to make it more suitable for printing
    mutate(cipdesc = str_sub(string = cipdesc, start = 1, end = 50)) %>%
    # re-order variables
    relocate(opeid6,unitid,instnm,ccbasic,stabbr,city,cipdig2)

# For debt and earnings variables, convert from character to numeric variables (which replaces "PrivacySuppressed" values with NA values)
df_scorecard <- df_scorecard %>%
  mutate(
    debt_all_stgp_eval_n = as.numeric(debt_all_stgp_eval_n),
    debt_all_stgp_eval_mean = as.numeric(debt_all_stgp_eval_mean),
    debt_all_stgp_eval_mdn = as.numeric(debt_all_stgp_eval_mdn),
    debt_all_stgp_eval_mdn10yrpay = as.numeric(debt_all_stgp_eval_mdn10yrpay),
    earn_count_wne_hi_1yr = as.numeric(earn_count_wne_hi_1yr),
    earn_mdn_hi_1yr = as.numeric(earn_mdn_hi_1yr),
    earn_count_wne_hi_2yr = as.numeric(earn_count_wne_hi_2yr),
    earn_mdn_hi_2yr = as.numeric(earn_mdn_hi_2yr)
  ) 

# add variable label to variable cipdig2
  attr(df_scorecard[['cipdig2']], which = 'label') <- 'broad degree field code = 2-digit classification of instructional programs (CIP) degree code'

# add variable label attribute back to debt and earnings variables
  for(v in c('debt_all_stgp_eval_n','debt_all_stgp_eval_mean','debt_all_stgp_eval_mdn','debt_all_stgp_eval_mdn10yrpay','earn_count_wne_hi_1yr','earn_mdn_hi_1yr','earn_count_wne_hi_2yr','earn_mdn_hi_2yr','cipdesc')) {
    attr(df_scorecard[[v]], which = 'label') <- attr(df_debt_earn_panel_labelled[[v]], which = 'label')
  }

  rm(df_debt_earn_panel_labelled) # comment this line out if you want to keep data frame

  df_score_ipeds <- df_ipeds_pop %>% 
    select(-instnm,-opeid6,-opeid,-c15basic,-region,-locale,-city,-stabbr,-zip) %>% mutate(one=1) %>%
    right_join(y=df_scorecard, by = 'unitid')

  df_score_ipeds <- df_score_ipeds %>% 
    # drop unitids from scorecard that don't merge to ipeds data (on tuition)
    filter(!is.na(one)) %>% 
    # drop observations that don't have mean debt data
    filter(!is.na(debt_all_stgp_eval_mean)) %>% 
    # drop for-profits
    filter(unclass(control) !=3) %>%
    # drop tuition/coa vars for law and md
    select(-one,-contains('law'),-contains('md_')) %>%
    relocate(opeid6,unitid,instnm,control,ccbasic,stabbr,region,city,locale,cipdig2,cipcode,cipdesc,credlev,creddesc,
      contains('ipeds'),starts_with('debt'),starts_with('earn'))

# create data frame that only contains observations for MAs in social work; filter(cipcode=='4407')
  df_socialwork <- df_score_ipeds %>% 
    filter(cipcode=='4407') %>%
    # remove observations with missing values of cost of attendance (better for teaching concepts)
    filter(!is.na(coa_grad_res))
load(file = url('https://github.com/anyone-can-cook/educ152/raw/main/data/college_scorecard/output_data/df_debt_earn_mba.Rdata'))
df_ipeds_pop %>% head()
## # A tibble: 6 x 38
##   instnm                              unitid opeid6 opeid                       control                                               c15basic stabbr       city       zip                    locale                                            region tuit_grad_res fee_grad_res tuit_grad_nres fee_grad_nres tuit_md_res fee_md_res tuit_md_nres fee_md_nres tuit_law_res fee_law_res tuit_law_nres fee_law_nres books_supplies roomboard_off oth_expense_off tuitfee_grad_res tuitfee_grad_nres tuitfee_md_res tuitfee_md_nres tuitfee_law_res tuitfee_law_nres coa_grad_res coa_grad_nres coa_md_res coa_md_nres coa_law_res coa_law_nres
##   <chr>                                <dbl> <chr>  <chr>                     <dbl+lbl>                                              <dbl+lbl> <chr+lbl>    <chr>      <chr>               <dbl+lbl>                                         <dbl+lbl>         <dbl>        <dbl>          <dbl>         <dbl>       <dbl>      <dbl>        <dbl>       <dbl>        <dbl>       <dbl>         <dbl>        <dbl>          <dbl>         <dbl>           <dbl>            <dbl>             <dbl>          <dbl>           <dbl>           <dbl>            <dbl>        <dbl>         <dbl>      <dbl>       <dbl>       <dbl>        <dbl>
## 1 Alabama A & M University            100654 001002 00100200 1 [Public]                 18 [Master^s Colleges & Universities: Larger Programs] AL [Alabama] Normal     35762      12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]          8130         1236          16260          1236          NA         NA           NA          NA           NA          NA            NA           NA           1600          8830            3090             9366             17496             NA              NA              NA               NA        22886         31016         NA          NA          NA           NA
## 2 University of Alabama at Birmingham 100663 001052 00105200 1 [Public]                 15 [Doctoral Universities: Highest Research Activity]  AL [Alabama] Birmingham 35294-0110 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]          7588            0          17294             0       26778          0        61848           0           NA          NA            NA           NA           1200         11682            4886             7588             17294          26778           61848              NA               NA        25356         35062      44546       79616          NA           NA
## 3 Amridge University                  100690 025034 02503400 2 [Private not-for-profit] 20 [Master^s Colleges & Universities: Small Programs]  AL [Alabama] Montgomery 36117-3553 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]         11700         1390          11700          1390          NA         NA           NA          NA           NA          NA            NA           NA           1500          9600            1600            13090             13090             NA              NA              NA               NA        25790         25790         NA          NA          NA           NA
## 4 University of Alabama in Huntsville 100706 001055 00105500 1 [Public]                 16 [Doctoral Universities: Higher Research Activity]   AL [Alabama] Huntsville 35899      12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]          9834          508          21830           508          NA         NA           NA          NA           NA          NA            NA           NA           1688          9603            3578            10342             22338             NA              NA              NA               NA        25211         37207         NA          NA          NA           NA
## 5 Alabama State University            100724 001005 00100500 1 [Public]                 19 [Master^s Colleges & Universities: Medium Programs] AL [Alabama] Montgomery 36104-0271 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]          6174         2284          12348          2284          NA         NA           NA          NA           NA          NA            NA           NA           1600          7320            4228             8458             14632             NA              NA              NA               NA        21606         27780         NA          NA          NA           NA
## 6 The University of Alabama           100751 001051 00105100 1 [Public]                 16 [Doctoral Universities: Higher Research Activity]   AL [Alabama] Tuscaloosa 35487-0166 13 [City: Small]   5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]         10470            0          26950             0       26778          0        61848           0        22760           0         38820            0           1200         13050            4116            10470             26950          26778           61848           22760            38820        28836         45316      45144       80214       41126        57186
df_generated_pop %>% head()
##   stdnorm_dist norm_dist lskew_dist rskew_dist
## 1  -1.38507062  43.07465  0.9172827 0.08271725
## 2   0.03832318  50.19162  0.7064826 0.29351743
## 3  -0.76303016  46.18485  0.8444117 0.15558827
## 4   0.21230614  51.06153  0.6694614 0.33053865
## 5   1.42553797  57.12769  0.3488487 0.65115131
## 6   0.74447982  53.72240  0.5401472 0.45985283
df_socialwork %>% head()
## # A tibble: 6 x 35
##   opeid6 unitid instnm                                            control                                                 ccbasic stabbr                                                         region city                                                                                                  locale cipdig2 cipcode cipdesc                  credlev creddesc        ipedscount1 ipedscount2 debt_all_stgp_eval_n debt_all_stgp_eval_mean debt_all_stgp_eval_mdn debt_all_stgp_eval_mdn10yrpay earn_count_wne_hi_1yr earn_mdn_hi_1yr earn_count_wne_hi_2yr earn_mdn_hi_2yr tuit_grad_res fee_grad_res tuit_grad_nres fee_grad_nres books_supplies roomboard_off oth_expense_off tuitfee_grad_res tuitfee_grad_nres coa_grad_res coa_grad_nres
##   <chr>   <dbl> <chr>                                           <dbl+lbl>                                               <dbl+lbl> <chr>                                                       <dbl+lbl> <chr>                                                                                              <dbl+lbl> <chr>   <chr>   <chr>                  <dbl+lbl> <chr>           <chr>       <chr>                      <dbl>                   <dbl>                  <dbl>                         <dbl>                 <dbl>           <dbl>                 <dbl>           <dbl>         <dbl>        <dbl>          <dbl>         <dbl>          <dbl>         <dbl>           <dbl>            <dbl>             <dbl>        <dbl>         <dbl>
## 1 001002 100654 Alabama A & M University       1 [Public]                 18 [Master's Colleges & Universities: Larger Programs]  AL     5 [Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)] Normal     12 [City: Midsize (population of at least 100,000 but less than 250,000)]                         44      4407    Social Work. 5 [Master's Degree] Master's Degree 92          79                           146                   51788                  50446                           518                   138           38065                    83           37819          8130         1236          16260          1236           1600          8830            3090             9366             17496        22886         31016
## 2 001005 100724 Alabama State University       1 [Public]                 19 [Master's Colleges & Universities: Medium Programs]  AL     5 [Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)] Montgomery 12 [City: Midsize (population of at least 100,000 but less than 250,000)]                         44      4407    Social Work. 5 [Master's Degree] Master's Degree 7           25                            NA                   38001                  41000                           421                    NA              NA                    NA              NA          6174         2284          12348          2284           1600          7320            4228             8458             14632        21606         27780
## 3 001051 100751 The University of Alabama      1 [Public]                 15 [Doctoral Universities: Very High Research Activity] AL     5 [Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)] Tuscaloosa 12 [City: Midsize (population of at least 100,000 but less than 250,000)]                         44      4407    Social Work. 5 [Master's Degree] Master's Degree 189         225                          244                   32830                  30750                           316                   276           39254                   292           38981         10470            0          26950             0           1200         13050            4116            10470             26950        28836         45316
## 4 001036 102049 Samford University             2 [Private not-for-profit] 17 [Doctoral/Professional Universities]                 AL     5 [Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)] Birmingham 21 [Suburb: Large (outside principal city, in urbanized area with population of 250,000 or more)] 44      4407    Social Work. 5 [Master's Degree] Master's Degree 8           13                            17                   49278                     NA                            NA                    NA              NA                    NA              NA         18530          610          18530           610           1000          9850            5498            19140             19140        35488         35488
## 5 001047 102368 Troy University                1 [Public]                 18 [Master's Colleges & Universities: Larger Programs]  AL     5 [Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)] Troy       33 [Town: Remote (in urban cluster more than 35 miles from an urbanized area)]                    44      4407    Social Work. 5 [Master's Degree] Master's Degree 106         86                           145                   30072                  23141                           238                   104           39609                    20           38289          7146          802          14292           802           1129          5815            5227             7948             15094        20119         27265
## 6 011462 102553 University of Alaska Anchorage 1 [Public]                 18 [Master's Colleges & Universities: Larger Programs]  AK     8 [Far West (AK, CA, HI, NV, OR, WA)]                          Anchorage  11 [City: Large (population of 250,000 or more)]                                                  44      4407    Social Work. 5 [Master's Degree] Master's Degree 20          19                            20                   30364                  26831                           275                    24           50606                    27           51750          9768         1424          19954          1758           1608         11728            7568            11192             21712        32096         42616

2 Plot population distribution

# Function to get sampling distribution (default: 1000 samples of size 200)
get_sampling_distribution <- function(data_vec, num_samples = 1000, sample_size = 200) {
  sample_means <- vector(mode = 'numeric', num_samples)

  for (i in 1:length(sample_means)) {
    samp <- sample(data_vec, sample_size)
    sample_means[[i]] <- mean(samp, na.rm = T)
  }

  sample_means
}

# Function to generate plot
plot_distribution <- function(data_vec1, data_vec2 = NULL, data_df = NULL, data_var = NULL, group_var = NULL, group_cat = NULL, pop_labels = NULL, show_group_hist = F, sampling_dist = F, plot_title = '') {
  
  two_pop <- !is.null(group_var) || !is.null(data_vec2)
  two_dist <- two_pop && !sampling_dist
  
  # Prep dataframe
  if (!is.null(data_df)) {
    data_df[[data_var]] <- unclass(data_df[[data_var]])  # unclass haven_labelled
    if (two_pop) {
      data_df[[group_var]] <- unclass(data_df[[group_var]])
    }
    
    data_df <- data_df %>% filter(!is.na(get(data_var)))  # remove NA rows
    
    # If group_cat not provided, use 2 values from group_var
    if (two_pop && is.null(group_cat)) {
      group_cat <- sort(unique(na.omit(data_df[[group_var]])))[1:2]
    }
    
    # Create population vector(s)
    if (!two_pop) {  # single population
      data_vec1 <- data_df[[data_var]]
    } else {  # two populations
      data_vec1 <- (data_df %>% filter(get(group_var) == group_cat[[1]]))[[data_var]]
      data_vec2 <- (data_df %>% filter(get(group_var) == group_cat[[2]]))[[data_var]]
    }
  }

  # Get sampling distribution
  if (sampling_dist) {
    if (!two_pop) {  # single population
      data_vec1 <- get_sampling_distribution(data_vec1)
    } else {  # two populations
      data_vec1_samp <- get_sampling_distribution(data_vec1)
      data_vec2_samp <- get_sampling_distribution(data_vec2)
      
      data_vec1 <- data_vec1_samp - data_vec2_samp
    }
  }
  
  # Legend text
  legend_text <- c(paste('Mean:', round(mean(data_vec1), 2),
                         '\nStd Dev:', round(sd(data_vec1), 2)),
                   paste('Median:', round(median(data_vec1), 2)))
  
  if (two_dist) {
    legend_text <- c(legend_text, 
                     paste('Mean:', round(mean(data_vec2), 2),
                           '\nStd Dev:', round(sd(data_vec2), 2)),
                     paste('Median:', round(median(data_vec2), 2)))
  }
  
  if (!is.null(pop_labels)) {
    pop1 <- pop_labels[[1]]
    pop2 <- pop_labels[[2]]
  } else if (!is.null(group_var)) {
    pop1 <- str_c(group_var, '=', group_cat[[1]])
    pop2 <- str_c(group_var, '=', group_cat[[2]])
  } else {
    pop1 <- 'Pop1'
    pop2 <- 'Pop2'
  }
  
  # Create statistics dataframe
  if (!two_dist) {
    lines_vec <- c('dotted')
    stats_vec <- c(mean(data_vec1), median(data_vec1))
    legend_title <- 'Statistics'
    
    if (two_pop && sampling_dist) {
      legend_title <- str_c('Statistics\n(', pop1, ' - ', pop2, ')')
    }
  } else {
    lines_vec <- c('dotted', 'dotdash')
    stats_vec <- c(mean(data_vec1), median(data_vec1), mean(data_vec2), median(data_vec2))
    legend_title <- str_c('Statistics\n(', pop1, ' vs. ', pop2, ')')
  }
  
  stats_df <- data.frame(
    pop = rep(lines_vec, each = 2),
    stat = rep(c('blue', 'red'), times = if_else(two_dist, 2, 1)),
    val = stats_vec
  )
  stats_df$pop <- factor(stats_df$pop, levels = c('dotted', 'dotdash'))
  
  # Plot distribution(s)
  p <- ggplot() +
    ggtitle(plot_title) + xlab('') + ylab('') +
    geom_density(aes(x = data_vec1), alpha = 0.8)
  
  if (!two_dist || show_group_hist) {  # show histogram only if 1 pop or show_group_hist is TRUE
    p <- p +
      geom_histogram(aes(x = data_vec1, y = ..density..), alpha = 0.4, position = 'identity')
  }
  
  if (two_dist) {
    p <- p +
      geom_density(aes(x = data_vec2), alpha = 0.8)
    
    if (show_group_hist) {  # show histogram only if show_group_hist is TRUE
      p <- p +
        geom_histogram(aes(x = data_vec2, y = ..density..), alpha = 0.4, position = 'identity', fill = 'wheat4')
    }
  }
  
  p <- p +
    geom_vline(data = stats_df,
               aes(xintercept = val, color = interaction(stat, pop), linetype = interaction(stat, pop)),
               size = 0.6, alpha = 0.8) +
    scale_color_manual(name = legend_title,
                       labels = legend_text,
                       values = as.character(stats_df$stat)) +
    scale_linetype_manual(name = legend_title,
                          labels = legend_text,
                          values = as.character(stats_df$pop)) +
    theme(plot.title = element_text(size = 10, face = 'bold', hjust = 0.5),
          legend.title = element_text(size = 9, face = 'bold'),
          legend.text = element_text(size = 8)) +
    guides(col = guide_legend(ncol = if_else(two_dist, 2, 1)))
  
  p
}

2.1 Single population

# IPEDS data: tuitfee_grad_res
plot_distribution(df_ipeds_pop$tuitfee_grad_res)

# IPEDS data: tuitfee_grad_res
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res')

2.2 Two populations

# IPEDS data: tuitfee_grad_res by control (default: use first 2 categorical values in group_var)
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control')

# IPEDS data: tuitfee_grad_res by control, shade histogram
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  show_group_hist = T)

# IPEDS data: tuitfee_grad_res by control, custom order
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  group_cat = c(2, 1))

# IPEDS data: tuitfee_grad_res by control, custom categories
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  group_cat = c(2, 3))

# Alternatively, prepare 2 populations first to pass in
pop1 <- (df_ipeds_pop %>% filter(unclass(control) == 1))$tuitfee_grad_res
pop2 <- (df_ipeds_pop %>% filter(unclass(control) == 2))$tuitfee_grad_res

plot_distribution(pop1, pop2)

# Custom labels
plot_distribution(pop1, pop2, pop_labels = c('Public', 'Private not-for-profit'))

plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  group_cat = c(1, 2), pop_labels = c('Public', 'Private not-for-profit'))

3 Plot single random sample

# Take single random sample of size 200
set.seed(124)
df_generated_sample <- df_generated_pop[sample(nrow(df_generated_pop), 200), ]

set.seed(124)
df_ipeds_sample <- df_ipeds_pop[sample(nrow(df_ipeds_pop), 200), ]

# Randomly generated data: standard normal
plot_distribution(df_generated_sample$stdnorm_dist)

4 Plot sampling distribution

4.1 Single population

# IPEDS data: tuitfee_grad_res
plot_distribution(df_ipeds_pop$tuitfee_grad_res, sampling_dist = T)

# IPEDS data: tuitfee_grad_res
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', sampling_dist = T) # sampling_dist = T runs the get_sampling_distribution function and plots the sampling distribution

plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', sampling_dist = F) # sampling_dist = F plots underlying variable

4.2 Two populations

# IPEDS data: tuitfee_grad_res by control (control=1 minus control=2)
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  sampling_dist = T, pop_labels = c('Public', 'Private'))

# IPEDS data: tuitfee_grad_res by control (control=2 minus control=1)
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  group_cat = c(2, 1), sampling_dist = T, pop_labels = c('Private', 'Public'))

5 Hypothesis testing

# Function to generate t-distribution plot
plot_t_distribution <- function(data_vec1, data_vec2 = NULL, data_df = NULL, data_var = NULL, group_var = NULL, group_cat = NULL, mu = 0, alpha = 0.05, alternative = 'two.sided', plot_title = '', shade_rejection = T, shade_pval = T, stacked = F, beta_y = NULL, beta_x = NULL, beta_x_cat = 1) {
  
  two_pop <- !is.null(group_var) || !is.null(data_vec2)
  
  # Prep dataframe
  if (!is.null(data_df) && is.null(beta_y)) {
    data_df[[data_var]] <- unclass(data_df[[data_var]])  # unclass haven_labelled
    if (two_pop) {
      data_df[[group_var]] <- unclass(data_df[[group_var]])
    }
    
    data_df <- data_df %>% filter(!is.na(get(data_var)))  # remove NA rows
    
    # If group_cat not provided, use 2 values from group_var
    if (two_pop && is.null(group_cat)) {
      group_cat <- sort(unique(na.omit(data_df[[group_var]])))[1:2]
    }
    
    # Create samples
    if (!two_pop) {  # single sample
      data_vec1 <- data_df[[data_var]]
    } else {  # two samples
      data_vec1 <- (data_df %>% filter(get(group_var) == group_cat[[1]]))[[data_var]]
      data_vec2 <- (data_df %>% filter(get(group_var) == group_cat[[2]]))[[data_var]]
    }
  }
  
  # Calculate stats
  if (!is.null(beta_y)) {  # beta testing
    mod <- summary(lm(formula = get(beta_y) ~ get(beta_x), data = data_df))
    
    idx <- beta_x_cat + 1
    
    deg_freedom <- mod$df[[2]]
    std_err <- mod$coefficients[[idx, 'Std. Error']]
    t <- mod$coefficients[[idx, 't value']]
  } else if (!two_pop) {  # single sample
    data_vec1 <- na.omit(data_vec1)
    
    # Calculate t-statistics
    sample_size <- length(data_vec1)
    deg_freedom <- sample_size - 1
    xbar <- mean(data_vec1)
    s <- sd(data_vec1)
    
    std_err <- s / sqrt(sample_size)
    t <- (xbar - mu) / std_err
  } else {  # two samples
    data_vec1 <- na.omit(data_vec1)
    data_vec2 <- na.omit(data_vec2)
    
    # Calculate t-statistics
    xbar1 <- mean(data_vec1)
    xbar2 <- mean(data_vec2)
    s1 <- sd(data_vec1)
    s2 <- sd(data_vec2)
    n1 <- length(data_vec1)
    n2 <- length(data_vec2)
    
    deg_freedom <- (s1**2/n1 + s2**2/n2)**2 / ((s1**2/n1)**2/(n1-1) + (s2**2/n2)**2/(n2-1))
    std_err <- sqrt(s1**2/n1 + s2**2/n2)
    t <- (xbar1 - xbar2) / std_err
  }
  
  # Calculate critical value and p-value
  if (alternative == 'less') {  # left-tailed
    cv_lower <- qt(p = alpha, df = deg_freedom, lower.tail = T)
    cv_legend <- round(cv_lower, 2)
    cv_legend2 <- round(cv_lower * std_err + mu, 2)
    pval <- round(pt(q = t, df = deg_freedom, lower.tail = T), 4)
  } else if (alternative == 'greater') {  # right-tailed
    cv_upper <- qt(p = alpha, df = deg_freedom, lower.tail = F)
    cv_legend <- round(cv_upper, 2)
    cv_legend2 <- round(cv_upper * std_err + mu, 2)
    pval <- round(pt(q = t, df = deg_freedom, lower.tail = F), 4)
  } else {  # two-tailed
    cv_lower <- qt(p = alpha / 2, df = deg_freedom, lower.tail = T)
    cv_upper <- qt(p = alpha / 2, df = deg_freedom, lower.tail = F)
    cv_legend <- str_c('\u00B1', round(cv_upper, 2))
    cv_legend2 <- str_c(round(cv_lower * std_err + mu, 2), ' & ', round(cv_upper * std_err + mu, 2))
    pval_half <- round(pt(q = t, df = deg_freedom, lower.tail = t < 0), 4)
    pval <- str_c(pval_half, ' + ', pval_half, ' = ', 2 * pval_half)
  }
  
  # Plot t-distribution
  p <- ggplot(data.frame(x = -c(-4, 4)), aes(x)) +
    ggtitle(plot_title) + xlab('') + ylab('') +
    stat_function(fun = dt, args = list(df = deg_freedom), xlim = c(-4, 4))
  
  # Shade rejection region using critical value
  if (alternative != 'greater') {
    p <- p + geom_vline(aes(xintercept = cv_lower, color = 'cval'),
                        linetype = 'dotted', size = 0.8, alpha = 0.8)
    
    if (shade_rejection) {
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(-4, cv_lower),
                             geom = 'area', alpha = 0.3, fill = 'red')
    }
    
    if (shade_pval) {
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(-4, if_else(alternative == 'two.sided', -abs(t), t)),
                             geom = 'area', alpha = 0.3, fill = 'blue')
    }
  }
  if (alternative != 'less') {
    p <- p + geom_vline(aes(xintercept = cv_upper, color = 'cval'),
                        linetype = 'dotted', size = 0.8, alpha = 0.8)
    
    if (shade_rejection) {
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(cv_upper, 4),
                             geom = 'area', alpha = 0.3, fill = 'red')
    }
    
    if (shade_pval) {
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(if_else(alternative == 'two.sided', abs(t), t), 4),
                             geom = 'area', alpha = 0.3, fill = 'blue')
    }
  }
  
  # Legend text
  legend_text <- c('t-statistics / p-value', 'critical value / alpha')
  
  if (stacked) {
    legend_text <- c(str_c('t-statistics: ', round(t, 2),
                     '\n(p-value: ', str_extract(pval, '[\\d.-]+$'), ')'),
                     str_c('Critical value: ', cv_legend,
                     '\n(alpha: ', round(alpha, 2), ')'))
  }
  
  stats_text <- c(str_c('t-statistics: ', round(t, 2)),
                  str_c('SE: ', round(std_err, 2)),
                  str_c('p-val: ', pval),
                  str_c('Critical value: ', cv_legend),
                  str_c('alpha: ', round(alpha, 2)))
  
  if (!stacked) {
    p <- p +
      annotate('text', size = 9*5/14, x = 4.84, y = 0.14, hjust = 0,
               label = 'bold(Statistics)', parse = T) +
      annotate('text', size = 8*5/14, x = 4.89, y = 0:4 * -0.015 + 0.12, hjust = 0,
               label = stats_text)
  }
  
  # Label plot
  p <- p +
    geom_vline(aes(xintercept = t, color = 'tstat'),
               linetype = 'dotted', size = 0.8, alpha = 0.8) +
    scale_x_continuous(sec.axis = sec_axis(trans = ~ . * std_err + mu)) +
    scale_color_manual(name = if_else(stacked, 'Statistics', 'Legend'),
                       breaks = c('tstat', 'cval'),
                       labels = legend_text,
                       values = c(tstat = 'blue', cval = 'red')) +
    theme(plot.title = element_text(size = 10, face = 'bold', hjust = 0.5),
          plot.margin = unit(c(5.5, if_else(stacked, 5.5, 30), 5.5, 5.5), 'pt'),
          legend.title = element_text(size = 9, face = 'bold'),
          legend.text = element_text(size = 8)) +
    coord_cartesian(xlim = c(-4, 4),
                    clip = 'off')

  p
}

5.1 Single population

# H0: population mean of coa_grad_res is $29,000
# Ha: population mean of coa_grad_res is NOT $29,000 (two-sided test)
# Verdict: Fail to reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 29000)
## 
##  One Sample t-test
## 
## data:  df_ipeds_sample$coa_grad_res
## t = -1.7293, df = 183, p-value = 0.08545
## alternative hypothesis: true mean is not equal to 29000
## 95 percent confidence interval:
##  26313.28 29176.89
## sample estimates:
## mean of x 
##  27745.08
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 29000)

# H0: population mean of coa_grad_res is $32,000
# Ha: population mean of coa_grad_res is NOT $32,000 (two-sided test)
# Verdict: Reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 32000)
## 
##  One Sample t-test
## 
## data:  df_ipeds_sample$coa_grad_res
## t = -5.8632, df = 183, p-value = 0.0000000208
## alternative hypothesis: true mean is not equal to 32000
## 95 percent confidence interval:
##  26313.28 29176.89
## sample estimates:
## mean of x 
##  27745.08
plot_t_distribution(data_df = df_ipeds_sample, data_var = 'coa_grad_res', mu = 32000)

# H0: population mean of coa_grad_res is $33,000
# Ha: population mean of coa_grad_res is LESS THAN $33,000 (left-sided test)
# Verdict: Reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 33000, alternative = 'less')
## 
##  One Sample t-test
## 
## data:  df_ipeds_sample$coa_grad_res
## t = -7.2412, df = 183, p-value = 0.000000000005962
## alternative hypothesis: true mean is less than 33000
## 95 percent confidence interval:
##      -Inf 28944.82
## sample estimates:
## mean of x 
##  27745.08
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 33000, alternative = 'less')

# H0: population mean of coa_grad_res is $31,000
# Ha: population mean of coa_grad_res is GREATER THAN $31,000 (right-sided test)
# Verdict: Fail to reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 31000, alternative = 'greater')
## 
##  One Sample t-test
## 
## data:  df_ipeds_sample$coa_grad_res
## t = -4.4852, df = 183, p-value = 1
## alternative hypothesis: true mean is greater than 31000
## 95 percent confidence interval:
##  26545.35      Inf
## sample estimates:
## mean of x 
##  27745.08
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 31000, alternative = 'greater')

5.2 Two populations

# H0: population mean of tuitfee_grad_res is equal for control=1 and control=2
# Ha: population means are NOT equal
# Verdict: Reject H0
t.test(formula = tuitfee_grad_res ~ control, data = df_ipeds_sample, 
       subset = unclass(control) %in% c(1, 2))
## 
##  Welch Two Sample t-test
## 
## data:  tuitfee_grad_res by control
## t = -7.1793, df = 111.77, p-value = 0.0000000000823
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -11070.409  -6281.462
## sample estimates:
## mean in group 1 mean in group 2 
##        9270.575       17946.510
# Here, the t-statistics line is off the grid
plot_t_distribution(data_df = df_ipeds_sample, data_var = 'tuitfee_grad_res',
                    group_var = 'control', group_cat = c(1, 2))

# Alternatively, prepare 2 samples first to pass in
sample1 <- (df_ipeds_sample %>% filter(unclass(control) == 1))$tuitfee_grad_res
sample2 <- (df_ipeds_sample %>% filter(unclass(control) == 2))$tuitfee_grad_res

t.test(sample1, sample2)
## 
##  Welch Two Sample t-test
## 
## data:  sample1 and sample2
## t = -7.1793, df = 111.77, p-value = 0.0000000000823
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -11070.409  -6281.462
## sample estimates:
## mean of x mean of y 
##  9270.575 17946.510
plot_t_distribution(sample1, sample2)

# compare locale = tuitfee_grad_nres between public and private
plot_t_distribution(data_df = df_ipeds_sample, data_var = 'coa_grad_nres',
                    group_var = 'control', group_cat = c(1, 2))

# compare locale = city-large (11) to suburb-large (21)
plot_t_distribution(data_df = df_ipeds_sample, data_var = 'roomboard_off',
                    group_var = 'locale', group_cat = c(11, 21))

5.3 Bivariate regression

summary(lm(formula = debt_all_stgp_eval_mean ~ coa_grad_res, data = df_socialwork))
## 
## Call:
## lm(formula = debt_all_stgp_eval_mean ~ coa_grad_res, data = df_socialwork)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25799  -5991   -926   5357  42453 
## 
## Coefficients:
##                 Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)  13445.93950  1830.24898   7.347     0.00000000000285 ***
## coa_grad_res     0.96869     0.06069  15.962 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9435 on 251 degrees of freedom
## Multiple R-squared:  0.5038, Adjusted R-squared:  0.5018 
## F-statistic: 254.8 on 1 and 251 DF,  p-value: < 0.00000000000000022
plot_t_distribution(beta_y = 'debt_all_stgp_eval_mean', beta_x = 'coa_grad_res', data_df = df_socialwork)

5.4 Categorical X

# 2 categories
summary(lm(formula = earn_mdn_hi_2yr ~ control, data = df_mba_fac))
## 
## Call:
## lm(formula = earn_mdn_hi_2yr ~ control, data = df_mba_fac)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38111 -13099  -4780   8208 119148 
## 
## Coefficients:
##                               Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)                      72104       1227  58.771 <0.0000000000000002 ***
## controlPrivate not-for-profit    -1688       1683  -1.003               0.316    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20640 on 602 degrees of freedom
##   (61 observations deleted due to missingness)
## Multiple R-squared:  0.001668,   Adjusted R-squared:  9.24e-06 
## F-statistic: 1.006 on 1 and 602 DF,  p-value: 0.3164
plot_t_distribution(beta_y = 'earn_mdn_hi_2yr', beta_x = 'control', data_df = df_mba_fac)

# More than 2 categories
summary(lm(formula = earn_mdn_hi_2yr ~ urban, data = df_mba_fac))
## 
## Call:
## lm(formula = earn_mdn_hi_2yr ~ urban, data = df_mba_fac)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -44625 -12659  -3672   8778 118233 
## 
## Coefficients:
##                     Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)            77303       1555  49.700 < 0.0000000000000002 ***
## urbanmed/small city    -5972       2143  -2.787              0.00549 ** 
## urbansuburb            -7049       2253  -3.128              0.00184 ** 
## urbantown/rural       -15148       2550  -5.942        0.00000000479 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20100 on 600 degrees of freedom
##   (61 observations deleted due to missingness)
## Multiple R-squared:  0.05629,    Adjusted R-squared:  0.05157 
## F-statistic: 11.93 on 3 and 600 DF,  p-value: 0.0000001349
# Default would plot first comparison category (med/small city)
plot_t_distribution(beta_y = 'earn_mdn_hi_2yr', beta_x = 'urban', data_df = df_mba_fac)

plot_t_distribution(beta_y = 'earn_mdn_hi_2yr', beta_x = 'urban', data_df = df_mba_fac, beta_x_cat = 1)  # equivalent

# Specify second comparison category
plot_t_distribution(beta_y = 'earn_mdn_hi_2yr', beta_x = 'urban', data_df = df_mba_fac, beta_x_cat = 2)

# Specify third comparison category
plot_t_distribution(beta_y = 'earn_mdn_hi_2yr', beta_x = 'urban', data_df = df_mba_fac, beta_x_cat = 3)

6 Combine plots

library(patchwork)

# Use / to place plots 1 per row (stacked)
plot_distribution(df_generated_pop$norm_dist, plot_title = 'Population distribution') /
  plot_distribution(df_generated_sample$norm_dist, plot_title = 'Single sample distribution') /
  plot_distribution(df_generated_pop$norm_dist, sampling_dist = T,
                    plot_title = 'Sampling distribution')

# Or use + and plot_layout()
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  group_cat = c(2, 1), plot_title = 'Population distributions') +
  plot_distribution(data_df = df_ipeds_sample, data_var = 'tuitfee_grad_res', group_var = 'control',
                    group_cat = c(2, 1), plot_title = 'Single sample distributions') +
  plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                    group_cat = c(2, 1), sampling_dist = T,
                    plot_title = 'Sampling distribution differences') +
  plot_layout(ncol = 1)

# When including t-distribution plot, need to specify stacked = T
plot_distribution(df_ipeds_sample$coa_grad_res, plot_title = 'Population distribution') +
  plot_distribution(df_ipeds_sample$coa_grad_res, plot_title = 'Single sample distribution') +
  plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 29000, stacked = T,
                      plot_title = 'Sampling distribution under null') +
  plot_layout(ncol = 1)